In [1]:
import numpy as np #linear algebra
import pandas as pd # data processing,CSV file I/O(e.g pd.read_csv)
import seaborn as sns # for statistical data visualization
import matplotlib.pyplot as mtp # for data visualization
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
In [2]:
data = pd.read_csv(r"C:\Users\laxma\Downloads\Mall_Customers.csv")
In [3]:
data.head()
Out[3]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [4]:
data.tail()
Out[4]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
195 196 Female 35 120 79
196 197 Female 45 126 28
197 198 Male 32 126 74
198 199 Male 32 137 18
199 200 Male 30 137 83
In [5]:
data.duplicated().sum()
Out[5]:
0
In [6]:
data.columns
Out[6]:
Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')
In [7]:
#VISUALIZATION
In [8]:
plt.bar(data['Age'],data['CustomerID'])
plt.xticks(rotation=90)
plt.show()
In [9]:
fig=px.bar(data,x='Annual Income (k$)',y='CustomerID',color='Annual Income (k$)')
fig.show()
In [10]:
plt.scatter(data['Age'],data['Spending Score (1-100)'],color='cyan')
plt.xticks(rotation=90)
plt.show()
In [11]:
plt.figure(figsize=(10,4))
sns.countplot(x='Spending Score (1-100)', data=data, color='b')
plt.xticks(rotation=90)
plt.show()
In [12]:
plt.figure(figsize=(10,4))
top_car = data['Annual Income (k$)'].value_counts().nlargest(10)
sns.countplot(y=data['Annual Income (k$)'], order=top_car.index, color='red')
Out[12]:
<AxesSubplot:xlabel='count', ylabel='Annual Income (k$)'>
In [13]:
sns.lineplot(x='Age', y='Annual Income (k$)', data=data)
Out[13]:
<AxesSubplot:xlabel='Age', ylabel='Annual Income (k$)'>
In [14]:
sns.barplot(data['CustomerID'],data['Gender'],color='r')
plt.xticks(rotation=90)
plt.show()
D:\anaconda files\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:

Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

In [15]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='CustomerID', y='Spending Score (1-100)')
plt.xlabel('CustomerID')
plt.ylabel('Spending Score (1-100)')
plt.show()
In [16]:
sns.displot(data["Gender"])
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x17dcab63640>
In [17]:
sns.boxplot(x='Annual Income (k$)',y='Spending Score (1-100)',data=data)
plt.xticks(rotation=90)
Out[17]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]),
 [Text(0, 0, '15'),
  Text(1, 0, '16'),
  Text(2, 0, '17'),
  Text(3, 0, '18'),
  Text(4, 0, '19'),
  Text(5, 0, '20'),
  Text(6, 0, '21'),
  Text(7, 0, '23'),
  Text(8, 0, '24'),
  Text(9, 0, '25'),
  Text(10, 0, '28'),
  Text(11, 0, '29'),
  Text(12, 0, '30'),
  Text(13, 0, '33'),
  Text(14, 0, '34'),
  Text(15, 0, '37'),
  Text(16, 0, '38'),
  Text(17, 0, '39'),
  Text(18, 0, '40'),
  Text(19, 0, '42'),
  Text(20, 0, '43'),
  Text(21, 0, '44'),
  Text(22, 0, '46'),
  Text(23, 0, '47'),
  Text(24, 0, '48'),
  Text(25, 0, '49'),
  Text(26, 0, '50'),
  Text(27, 0, '54'),
  Text(28, 0, '57'),
  Text(29, 0, '58'),
  Text(30, 0, '59'),
  Text(31, 0, '60'),
  Text(32, 0, '61'),
  Text(33, 0, '62'),
  Text(34, 0, '63'),
  Text(35, 0, '64'),
  Text(36, 0, '65'),
  Text(37, 0, '67'),
  Text(38, 0, '69'),
  Text(39, 0, '70'),
  Text(40, 0, '71'),
  Text(41, 0, '72'),
  Text(42, 0, '73'),
  Text(43, 0, '74'),
  Text(44, 0, '75'),
  Text(45, 0, '76'),
  Text(46, 0, '77'),
  Text(47, 0, '78'),
  Text(48, 0, '79'),
  Text(49, 0, '81'),
  Text(50, 0, '85'),
  Text(51, 0, '86'),
  Text(52, 0, '87'),
  Text(53, 0, '88'),
  Text(54, 0, '93'),
  Text(55, 0, '97'),
  Text(56, 0, '98'),
  Text(57, 0, '99'),
  Text(58, 0, '101'),
  Text(59, 0, '103'),
  Text(60, 0, '113'),
  Text(61, 0, '120'),
  Text(62, 0, '126'),
  Text(63, 0, '137')])
In [18]:
sns.violinplot(x='Gender',y='Spending Score (1-100)',data=data)
Out[18]:
<AxesSubplot:xlabel='Gender', ylabel='Spending Score (1-100)'>
In [19]:
#MODEL BUILDING
In [20]:
x = data.iloc[:,[3,4]].values
In [21]:
import scipy.cluster.hierarchy as shc
dendro = shc.dendrogram(shc.linkage(x, method='ward'))
mtp.title('Dendrogram Plot')
mtp.ylabel('Euclidean Distance')
mtp.xlabel('Customer')
mtp.show()
In [22]:
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean',linkage='ward')
y_pred=hc.fit_predict(x)
In [23]:
y_pred
Out[23]:
array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int64)
In [24]:
mtp.scatter(x[y_pred == 0, 0], x[y_pred == 0,1], s = 100, c = 'blue', label = 'Cluster 1')
mtp.scatter(x[y_pred == 1, 0], x[y_pred == 1,1], s = 100, c = 'green', label = 'Cluster 2')
mtp.scatter(x[y_pred== 2, 0], x[y_pred == 2,1], s = 100, c = 'red', label = 'Cluster 3')
mtp.scatter(x[y_pred == 3, 0], x[y_pred == 3,1], s = 100, c = 'cyan', label = 'Cluster 4')
mtp.scatter(x[y_pred == 4, 0], x[y_pred == 4,1], s = 100, c = 'magenta', label = 'Cluster 5')
mtp.title('cluster of customer')
mtp.xlabel('Annual Income(k$)')
mtp.legend()
mtp.show()
In [ ]: